import pandas as pd # Dataframe manipulation
import numpy as np # Mathematics operation
from datetime import datetime # Date time manipulation
import matplotlib.pyplot as plt # Data visualization
from sklearn import preprocessing # Data preprocessing
import re # Regular Expression
# Load the lemonade sales dataset from the working directory.
data_lemon = pd.read_csv('Lemonade.csv')
data_lemon.head() # Show first data
data_lemon.info() # Data structure
data_lemon.describe() # Summary of statistics
# Summary of Statistics each categories
data_lemon.groupby(['Day'],as_index = False).describe()
data_lemon['Date'].head()
# Parse the 'Date' strings (day/month/year) into datetime64 values so that
# .dt accessors (e.g. .dt.month below) become available.
data_lemon['Date'] = pd.to_datetime(data_lemon['Date'],format = '%d/%m/%Y')
data_lemon['Date'].head()
print(data_lemon.columns.values) # Column's name
# Relabel every column with Indonesian names, then switch the rainfall
# column back to its English label.
data_lemon.columns = ['Tanggal','Hari','Suhu','Intensitas Hujan','Leaflet','Harga','Penjualan']
data_lemon.rename(columns={'Intensitas Hujan': 'Rainfall'}, inplace=True)
# Revenue ('Pendapatan') = unit price * units sold.
data_lemon['Pendapatan'] = data_lemon['Harga'] * data_lemon['Penjualan']
data_lemon[['Pendapatan', 'Harga', 'Penjualan']].sample(n=5)
data_lemon.head()  # Show first data
# Derive the month number from the parsed date and place it as the
# second column ('Bulan').
data_lemon.insert(loc=1, column='Bulan', value=data_lemon['Tanggal'].dt.month)
data_lemon.head()  # Show first data
# Lookup table: month number (1-12) -> English month name.
cats_month = {1:'January',2:'February',3:'March',4:'April',5:'May',6:'June',
              7:'July',8:'August',9:'September',10:'October',11:'November',12:'December'}
# BUG FIX: passing dict_keys/dict_values views as to_replace/value is
# rejected by modern pandas ("Expecting a list or ndarray"). Passing the
# dict itself is the supported, equivalent form.
data_lemon['Bulan'] = data_lemon['Bulan'].replace(cats_month)
print(data_lemon['Bulan'].unique())
data_lemon.head() # Show first data
# How many distinct day names appear, and which ones?
print('Number unique value:', data_lemon['Hari'].nunique())
print('Unique value of Hari: \n', data_lemon['Hari'].unique())
# Report the missing-value count for every column, one line each.
for column_name in data_lemon.columns.values:
    missing = data_lemon[column_name].isnull().sum()
    print(column_name, ':', missing)
data_agg = data_lemon.copy() # Work on a copy so data_lemon stays untouched
# BUG FIX: selecting several columns from a groupby requires a list
# (double brackets); the bare-tuple form ['A','B'] was removed in pandas 1.x.
data_agg.groupby(['Hari'],as_index = False)[['Penjualan','Pendapatan']].agg('sum')
# Specific aggregation function per column
df_agg = data_agg.groupby(['Hari'],as_index = False).agg({'Suhu':'mean','Rainfall':'mean','Leaflet':'sum',
'Harga':'mean','Penjualan':'sum','Pendapatan':'sum'})
df_agg
# Sort days in calendar order via an ordered categorical
cats = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
df_agg['Hari'] = pd.Categorical(df_agg['Hari'],categories = cats,ordered = True)
df_agg.sort_values('Hari',ascending = True)
# BUG FIX: column selection on a groupby needs a list (double brackets);
# the bare-tuple form was removed in pandas 1.x.
data_agg.groupby(['Bulan'],as_index = False)[['Penjualan','Pendapatan']].agg('sum')
# Specific aggregation function
df_agg = data_agg.groupby(['Hari'],as_index = False).agg({'Suhu':'mean','Rainfall':'mean','Leaflet':'sum',
'Harga':'mean','Penjualan':'sum','Pendapatan':'sum'})
# BUG FIX: DataFrame.reindex_axis was removed in pandas 0.25;
# reindex is the drop-in replacement for row reordering.
df_agg.reindex([1,5,6,4,0,2,3]) # Reindex rows into a custom order
# Month x Day contingency table. Add 'margins = True' to show totals
# (the keyword is `margins`, not `margin`).
df_cross = pd.crosstab(data_lemon.Bulan,data_lemon.Hari)
df_cross
print(df_cross.columns.values) # Column's name
df_cross.sort_index(inplace = True)
# Order columns by the weekday list `cats` and rows by calendar month.
# BUG FIX: reindex expects a list-like index; dict_values views are not
# reliably accepted, so materialize them into a list first.
df_cross[cats].reindex(list(cats_month.values()))
# Create duplicate data for applying standardization
data_std = data_lemon.copy()
print(data_std.columns.values) # Column's names
# Create a min max processing object
min_max_scaler = preprocessing.MinMaxScaler()
# BUG FIX: the temperature column was renamed to 'Suhu' earlier in the
# script, so indexing 'Temperature' raises KeyError. Also, sklearn scalers
# require 2-D input, so select the column with double brackets instead of
# passing a 1-D Series.
data_std['Suhu'] = data_std['Suhu'].values.astype(float)
data_std['Suhu'] = min_max_scaler.fit_transform(data_std[['Suhu']])
data_std['Suhu'].describe() # Summary of statistics
# Z-score standardization: (x - mean) / std
data_std['Rainfall'] = (data_std['Rainfall'] - data_std['Rainfall'].mean())/data_std['Rainfall'].std()
data_std['Rainfall'].describe() # Summary of statistics
# Work on a fresh copy for the filtering/indexing examples.
data_filter = data_lemon.copy()
data_filter.head()
data_filter.info()
data_filter.set_index('Tanggal',inplace = True) # Inplace means we set index permanently to data_filter
data_filter.head()
# Label-based column selection.
data_filter[['Bulan','Hari','Suhu','Penjualan']].head()
data_filter.loc['2017-01-01'] # Can be a list of index
data_filter.loc['2017-01-01',['Hari','Leaflet','Penjualan','Pendapatan']]
# Purely positional selection: rows 1-6, columns at positions 1, 4, 6, 7.
data_filter.iloc[[1,2,3,4,5,6],[1,4,6,7]] # Numeric
# Boolean mask that is True on Monday rows.
is_monday = data_filter['Hari'] == 'Monday'
# The mask is a pandas Series with one boolean per row of data_filter.
print('Data type: ', type(is_monday), '\n Length of variable: ', len(is_monday))
monday_rows = data_filter[is_monday]
monday_rows.head()
monday_rows['Hari'].unique()  # Confirms the filter kept only Monday
# Combine two filters: Monday rows AND sales above 15.
# Element-wise operators on boolean Series: & = and, | = or, ~ = not.
mask_monday = data_filter['Hari'] == 'Monday'
mask_high_sales = data_filter['Penjualan'] > 15
data_monday_sales = data_filter[mask_monday & mask_high_sales]
print(data_monday_sales['Penjualan'].describe())
print('Unique Date: ', data_monday_sales['Hari'].unique())
# The same filters written inline (equivalent to 6.4.1 and 6.4.2 above).
data_filter[data_filter['Hari'] == 'Monday'].head()
data_filter[(data_filter['Hari'] == 'Monday') & (data_filter['Penjualan'] > 15)].head()
# Goal: all days with sales between 15 and 30, inclusive.
# Manual version — two comparisons combined with &.
in_range_manual = (data_filter['Penjualan'] >= 15) & (data_filter['Penjualan'] <= 30)
subset_manual = data_filter[in_range_manual]
subset_manual.head()
subset_manual.info()
# Same result via the Series.between helper (bounds inclusive by default).
in_range_between = data_filter['Penjualan'].between(15, 30)
subset_between = data_filter[in_range_between]
subset_between.head()
subset_between.info()
# Persist the standardized copy; index=False drops the row-index column.
data_std.to_csv('Standarization Data.csv',index = False,header = True)